This commit is contained in:
Jay D Dee
2017-12-08 15:39:28 -05:00
parent 4b57ac0eb9
commit af1c940919
53 changed files with 1324 additions and 4790 deletions

View File

@@ -9,18 +9,18 @@
void blakehash_4way(void *state, const void *input)
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -32,7 +32,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -49,7 +49,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
uint32_t *noncep = vdata + 76; // 19*4

View File

@@ -13,11 +13,12 @@ bool register_blake_algo( algo_gate_t* gate )
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
four_way_not_tested();
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif

View File

@@ -536,22 +536,22 @@ do { \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_epi32( *(buf + 0) ); \
M[0x1] = mm_byteswap_epi32( *(buf + 1) ); \
M[0x2] = mm_byteswap_epi32( *(buf + 2) ); \
M[0x3] = mm_byteswap_epi32( *(buf + 3) ); \
M[0x4] = mm_byteswap_epi32( *(buf + 4) ); \
M[0x5] = mm_byteswap_epi32( *(buf + 5) ); \
M[0x6] = mm_byteswap_epi32( *(buf + 6) ); \
M[0x7] = mm_byteswap_epi32( *(buf + 7) ); \
M[0x8] = mm_byteswap_epi32( *(buf + 8) ); \
M[0x9] = mm_byteswap_epi32( *(buf + 9) ); \
M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -601,22 +601,22 @@ do { \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_epi32( * buf ); \
M1 = mm_byteswap_epi32( *(buf+1) ); \
M2 = mm_byteswap_epi32( *(buf+2) ); \
M3 = mm_byteswap_epi32( *(buf+3) ); \
M4 = mm_byteswap_epi32( *(buf+4) ); \
M5 = mm_byteswap_epi32( *(buf+5) ); \
M6 = mm_byteswap_epi32( *(buf+6) ); \
M7 = mm_byteswap_epi32( *(buf+7) ); \
M8 = mm_byteswap_epi32( *(buf+8) ); \
M9 = mm_byteswap_epi32( *(buf+9) ); \
MA = mm_byteswap_epi32( *(buf+10) ); \
MB = mm_byteswap_epi32( *(buf+11) ); \
MC = mm_byteswap_epi32( *(buf+12) ); \
MD = mm_byteswap_epi32( *(buf+13) ); \
ME = mm_byteswap_epi32( *(buf+14) ); \
MF = mm_byteswap_epi32( *(buf+15) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -722,22 +722,22 @@ do { \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +787,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_epi64( *(buf + 0) ); \
M1 = mm256_byteswap_epi64( *(buf + 1) ); \
M2 = mm256_byteswap_epi64( *(buf + 2) ); \
M3 = mm256_byteswap_epi64( *(buf + 3) ); \
M4 = mm256_byteswap_epi64( *(buf + 4) ); \
M5 = mm256_byteswap_epi64( *(buf + 5) ); \
M6 = mm256_byteswap_epi64( *(buf + 6) ); \
M7 = mm256_byteswap_epi64( *(buf + 7) ); \
M8 = mm256_byteswap_epi64( *(buf + 8) ); \
M9 = mm256_byteswap_epi64( *(buf + 9) ); \
MA = mm256_byteswap_epi64( *(buf + 10) ); \
MB = mm256_byteswap_epi64( *(buf + 11) ); \
MC = mm256_byteswap_epi64( *(buf + 12) ); \
MD = mm256_byteswap_epi64( *(buf + 13) ); \
ME = mm256_byteswap_epi64( *(buf + 14) ); \
MF = mm256_byteswap_epi64( *(buf + 15) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
if ( len < buf_size - ptr )
{
memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 48 )
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_m128i( u.buf, 56>>2 );
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_epi32( sc->H[k] );
out[k] = mm_byteswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
memset_zero_m256i( u.buf, 112>>3 );
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_epi64( sc->H[k] );
out[k] = mm256_byteswap_64( sc->H[k] );
}
#endif

View File

@@ -13,17 +13,17 @@ static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
int tail_len = 180 - DECRED_MIDSTATE_LEN;
@@ -53,7 +53,7 @@ void decred_hash_4way( void *state, const void *input )
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
@@ -79,7 +79,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
@@ -97,7 +97,8 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
// memcpy(endiandata, pdata, 180);
m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
// use the old way until new way updated for size.
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {

View File

@@ -144,7 +144,8 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
@@ -153,9 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
gate->hash = (void*)&decred_hash;
#endif
// gate->optimizations = SSE2_OPT;
// gate->scanhash = (void*)&scanhash_decred;
// gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;

View File

@@ -30,13 +30,13 @@ extern void pentablakehash_4way( void *output, const void *input )
blake512_4way_close( &ctx, vhash );
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
sph_blake512_context ctx2_blake;
sph_blake512_init(&ctx2_blake);
sph_blake512(&ctx2_blake, sin0, 80);
sph_blake512_close(&ctx2_blake, (void*) hash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
uint64_t* hash64 = (uint64_t*)hash;
for( int i = 0; i < 8; i++ )
{
@@ -60,7 +60,7 @@ for( int i = 0; i < 8; i++ )
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
@@ -141,7 +141,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
{

View File

@@ -3,13 +3,13 @@
// Populates the algo gate for pentablake: sets the scanhash, hash,
// optimizations and get_max64 entries, then returns true.
// NOTE(review): this listing appears to be a diff with its +/- markers
// stripped, so one of the two `optimizations` assignments below is probably
// the removed line -- confirm against the real file before relying on it.
bool register_pentablake_algo( algo_gate_t* gate )
{
#if defined (PENTABLAKE_4WAY)
// As listed, this value is overwritten by the unconditional assignment
// after the #endif.
gate->optimizations = SSE2_OPT | AVX2_OPT;
// PENTABLAKE_4WAY build: register the 4-way scanhash/hash entry points.
gate->scanhash = (void*)&scanhash_pentablake_4way;
gate->hash = (void*)&pentablakehash_4way;
#else
// Otherwise register the non-4way scanhash/hash entry points.
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
// Set for both build variants.
gate->optimizations = FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
// NOTE(review): the ';' after the closing brace is an empty file-scope
// declaration; compilers generally accept it but it is not needed.
};