mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.3
This commit is contained in:
@@ -6,13 +6,12 @@
|
||||
#include "sha256-hash.h"
|
||||
#include "compat.h"
|
||||
|
||||
/*
|
||||
static const uint32_t H256[8] =
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
static const uint32_t K256[64] =
|
||||
{
|
||||
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i T1, T2; \
|
||||
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m128i K = v128_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
|
||||
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[58] ),
|
||||
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
|
||||
B = _mm_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
|
||||
T1 = _mm_add_epi32( v128_32( K256[59] ),
|
||||
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
|
||||
A = _mm_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
|
||||
T2 = _mm_add_epi32( v128_32( K256[60] ),
|
||||
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
|
||||
H = _mm_add_epi32( H, T2 );
|
||||
|
||||
targ = _mm_set1_epi32( target[7] );
|
||||
targ = v128_32( target[7] );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
|
||||
|
||||
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
|
||||
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[61] ),
|
||||
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
|
||||
G = _mm_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
|
||||
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
|
||||
|
||||
if ( ( 0 != ( t6_mask & mm128_movmask_32(
|
||||
@@ -440,14 +439,14 @@ return 1;
|
||||
void sha256_4way_init( sha256_4way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v128_32( sha256_iv[0] );
|
||||
sc->val[1] = v128_32( sha256_iv[1] );
|
||||
sc->val[2] = v128_32( sha256_iv[2] );
|
||||
sc->val[3] = v128_32( sha256_iv[3] );
|
||||
sc->val[4] = v128_32( sha256_iv[4] );
|
||||
sc->val[5] = v128_32( sha256_iv[5] );
|
||||
sc->val[6] = v128_32( sha256_iv[6] );
|
||||
sc->val[7] = v128_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
|
||||
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
|
||||
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -614,7 +613,7 @@ do { \
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
@@ -634,7 +633,7 @@ do { \
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
@@ -643,7 +642,7 @@ do { \
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
||||
@@ -666,7 +665,7 @@ do { \
|
||||
|
||||
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
|
||||
W[ i0 ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -677,7 +676,7 @@ do { \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
\
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
|
||||
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
|
||||
W[ (i1) ] ); \
|
||||
T1 = BSG2_1x( D ); \
|
||||
T2 = BSG2_0x( H ); \
|
||||
@@ -790,7 +789,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
|
||||
_mm256_set1_epi32( K256[3] ) );
|
||||
v256_32( K256[3] ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
|
||||
MAJx(F, G, H) ) );
|
||||
@@ -910,12 +909,11 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
|
||||
int flip;
|
||||
int t6_mask;
|
||||
__m256i vmask, targ, hash;
|
||||
__m256i W[16]; memcpy_256( W, data, 16 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
uint8_t flip, t6_mask;
|
||||
|
||||
A = _mm256_load_si256( state_in );
|
||||
B = _mm256_load_si256( state_in+1 );
|
||||
@@ -991,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// round 58 to 60 part 1
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[58] ),
|
||||
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
|
||||
B = _mm256_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
|
||||
T1 = _mm256_add_epi32( v256_32( K256[59] ),
|
||||
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
|
||||
T2 = _mm256_add_epi32( v256_32( K256[60] ),
|
||||
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
|
||||
H = _mm256_add_epi32( H, T2 );
|
||||
|
||||
// Got H, test it.
|
||||
targ = _mm256_set1_epi32( target[7] );
|
||||
targ = v256_32( target[7] );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
if ( target[7] )
|
||||
{
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
||||
|
||||
// round 58 part 2
|
||||
@@ -1018,14 +1018,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
MAJx( G, H, A ) ) );
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[61] ),
|
||||
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
|
||||
G = _mm256_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
// Testing H was inconclusive: hash7 == target7, need to test G
|
||||
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
|
||||
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
|
||||
|
||||
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
|
||||
@@ -1078,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
void sha256_8way_init( sha256_8way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v256_32( sha256_iv[0] );
|
||||
sc->val[1] = v256_32( sha256_iv[1] );
|
||||
sc->val[2] = v256_32( sha256_iv[2] );
|
||||
sc->val[3] = v256_32( sha256_iv[3] );
|
||||
sc->val[4] = v256_32( sha256_iv[4] );
|
||||
sc->val[5] = v256_32( sha256_iv[5] );
|
||||
sc->val[6] = v256_32( sha256_iv[6] );
|
||||
sc->val[7] = v256_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
// need to handle odd byte length for yespower.
|
||||
@@ -1131,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1147,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
|
||||
|
||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
@@ -1210,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T1 = BSG2_1x16( E ); \
|
||||
__m512i T2 = BSG2_0x16( A ); \
|
||||
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
|
||||
@@ -1224,7 +1224,7 @@ do { \
|
||||
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
||||
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v512_32( K256[(i)+(j)] ) ); \
|
||||
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
@@ -1234,7 +1234,7 @@ do { \
|
||||
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m512i K = v512_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
@@ -1345,7 +1345,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[3] ) );
|
||||
v512_32( K256[3] ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
|
||||
MAJx16(F, G, H) ) );
|
||||
@@ -1566,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// rounds 58 to 60 part 1
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[58] ),
|
||||
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
|
||||
B = _mm512_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
|
||||
T1 = _mm512_add_epi32( v512_32( K256[59] ),
|
||||
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
|
||||
T2 = _mm512_add_epi32( v512_32( K256[60] ),
|
||||
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
|
||||
H = _mm512_add_epi32( H, T2 );
|
||||
|
||||
// got H, test it against target[7]
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[7] );
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = v512_32( target[7] );
|
||||
if ( target[7] )
|
||||
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
|
||||
return 0;
|
||||
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
|
||||
@@ -1591,15 +1592,15 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[61] ),
|
||||
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
|
||||
G = _mm512_add_epi32( G, T0 );
|
||||
|
||||
// got G, test it against target[6] if indicated
|
||||
if ( t6_mask != 0 )
|
||||
if ( (uint16_t)t6_mask )
|
||||
{
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[6] );
|
||||
targ = v512_32( target[6] );
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
|
||||
return 0;
|
||||
}
|
||||
@@ -1637,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
void sha256_16way_init( sha256_16way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v512_32( sha256_iv[0] );
|
||||
sc->val[1] = v512_32( sha256_iv[1] );
|
||||
sc->val[2] = v512_32( sha256_iv[2] );
|
||||
sc->val[3] = v512_32( sha256_iv[3] );
|
||||
sc->val[4] = v512_32( sha256_iv[4] );
|
||||
sc->val[5] = v512_32( sha256_iv[5] );
|
||||
sc->val[6] = v512_32( sha256_iv[6] );
|
||||
sc->val[7] = v512_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
||||
@@ -1688,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1704,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
|
||||
|
||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -118,10 +118,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -130,42 +130,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -216,33 +216,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 );
|
||||
vdata[16+15] = v256_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 );
|
||||
block[15] = v256_32( 32*8 );
|
||||
|
||||
// initialize state for second hash
|
||||
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -298,31 +298,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 );
|
||||
vdata[16+15] = v128_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 );
|
||||
block[15] = v128_32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
@@ -51,8 +51,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
// fill & pad second block without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 3] = block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
@@ -128,10 +127,10 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -140,42 +139,42 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256dt_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
|
||||
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
|
||||
istate[0] = v512_32( sha256dt_iv[0] );
|
||||
istate[1] = v512_32( sha256dt_iv[1] );
|
||||
istate[2] = v512_32( sha256dt_iv[2] );
|
||||
istate[3] = v512_32( sha256dt_iv[3] );
|
||||
istate[4] = v512_32( sha256dt_iv[4] );
|
||||
istate[5] = v512_32( sha256dt_iv[5] );
|
||||
istate[6] = v512_32( sha256dt_iv[6] );
|
||||
istate[7] = v512_32( sha256dt_iv[7] );
|
||||
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
|
||||
block[15] = v512_32( 0x300 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -226,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v256_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 0x300 );
|
||||
block[15] = v256_32( 0x300 );
|
||||
|
||||
// initialize state for swecond hash
|
||||
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
|
||||
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
// initialize state for second hash
|
||||
istate[0] = v256_32( sha256dt_iv[0] );
|
||||
istate[1] = v256_32( sha256dt_iv[1] );
|
||||
istate[2] = v256_32( sha256dt_iv[2] );
|
||||
istate[3] = v256_32( sha256dt_iv[3] );
|
||||
istate[4] = v256_32( sha256dt_iv[4] );
|
||||
istate[5] = v256_32( sha256dt_iv[5] );
|
||||
istate[6] = v256_32( sha256dt_iv[6] );
|
||||
istate[7] = v256_32( sha256dt_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -308,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v128_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 0x300 );
|
||||
block[15] = v128_32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
|
||||
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
initstate[0] = v128_32( sha256dt_iv[0] );
|
||||
initstate[1] = v128_32( sha256dt_iv[1] );
|
||||
initstate[2] = v128_32( sha256dt_iv[2] );
|
||||
initstate[3] = v128_32( sha256dt_iv[3] );
|
||||
initstate[4] = v128_32( sha256dt_iv[4] );
|
||||
initstate[5] = v128_32( sha256dt_iv[5] );
|
||||
initstate[6] = v128_32( sha256dt_iv[6] );
|
||||
initstate[7] = v128_32( sha256dt_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
@@ -342,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
|
||||
// {
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
// }
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
@@ -30,10 +30,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -42,42 +42,42 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd & 3rd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -222,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v256_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v256_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -313,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v128_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v128_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
@@ -39,57 +39,429 @@
|
||||
/*
|
||||
static const uint64_t H512[8] =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
*/
|
||||
|
||||
static const uint64_t K512[80] =
|
||||
{
|
||||
0x428A2F98D728AE22, 0x7137449123EF65CD,
|
||||
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||
0x3956C25BF348B538, 0x59F111F1B605D019,
|
||||
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||
0xD807AA98A3030242, 0x12835B0145706FBE,
|
||||
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
|
||||
0x9BDC06A725C71235, 0xC19BF174CF692694,
|
||||
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
|
||||
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
|
||||
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
|
||||
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
|
||||
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
|
||||
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
|
||||
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
|
||||
0x06CA6351E003826F, 0x142929670A0E6E70,
|
||||
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
|
||||
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
|
||||
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
|
||||
0x81C2C92E47EDAEE6, 0x92722C851482353B,
|
||||
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
|
||||
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
|
||||
0xD192E819D6EF5218, 0xD69906245565A910,
|
||||
0xF40E35855771202A, 0x106AA07032BBD1B8,
|
||||
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
|
||||
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
|
||||
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
|
||||
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
|
||||
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
|
||||
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
|
||||
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
|
||||
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
|
||||
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
|
||||
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
|
||||
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
|
||||
0x113F9804BEF90DAE, 0x1B710B35131C471B,
|
||||
0x28DB77F523047D84, 0x32CAAB7B40C72493,
|
||||
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
|
||||
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
|
||||
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
|
||||
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
|
||||
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
|
||||
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
|
||||
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
|
||||
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
|
||||
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
|
||||
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
|
||||
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
|
||||
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
|
||||
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
|
||||
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
|
||||
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
|
||||
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
|
||||
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
|
||||
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
|
||||
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||
};
|
||||
|
||||
#if defined(__AVX2__) && defined(__SHA512__)
|
||||
|
||||
// SHA-512 implemented using SHA512 CPU extension.
|
||||
|
||||
// Experimental. Not tested. Not reviewed. Compile tested only.
|
||||
|
||||
// Needs GCC-13 for compilation.
|
||||
// Needs Intel Lunar Lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
|
||||
// Modelled after noloader sha256 implementation.
|
||||
|
||||
// It's not clear how SHA512 will be supported before AVX10 considering how
|
||||
// dependent it is on _mm256_alignr_epi64 which is only available with AVX512VL
|
||||
// until AVX10-256.
|
||||
|
||||
// mm256_alignr_1x64( v1, v0 ): treat v1:v0 as one 8 x 64 bit vector, v0 low,
// and return elements 1..4, i.e. { v0[1], v0[2], v0[3], v1[0] } from the
// lowest to the highest element.

#if defined(__AVX512VL__)

#define mm256_alignr_1x64( v1, v0 )   _mm256_alignr_epi64( v1, v0, 1 )

#else

// AVX2 emulation: replace element 0 of v0 with element 0 of v1, then rotate
// the whole vector down by one element so that v1[0] ends up on top.
// No trailing semicolon, the macro is usable inside an expression.
#define mm256_alignr_1x64( v1, v0 ) \
   _mm256_permute4x64_epi64( _mm256_blend_epi32( (v0), (v1), 0x03 ), 0x39 )

#endif
|
||||
|
||||
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in )
|
||||
{
|
||||
__m256i STATE0, STATE1;
|
||||
__m256i MSG, TMP, BSWAP64;
|
||||
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m256i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
|
||||
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
|
||||
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
|
||||
0x0001020304050607 ) )
|
||||
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
|
||||
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
|
||||
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
|
||||
|
||||
// Save initial state
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
|
||||
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
|
||||
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
|
||||
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
|
||||
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
|
||||
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Add initial state
|
||||
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
|
||||
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
|
||||
|
||||
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
|
||||
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
|
||||
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
|
||||
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
|
||||
}
|
||||
|
||||
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in )
|
||||
{
|
||||
__m256i STATE0, STATE1;
|
||||
__m256i MSG, TMP, BSWAP64;
|
||||
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m256i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
|
||||
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
|
||||
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
|
||||
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
|
||||
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
|
||||
|
||||
// Save initial state
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Add initial state
|
||||
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
|
||||
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
|
||||
|
||||
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
|
||||
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
|
||||
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
|
||||
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -117,7 +489,7 @@ static const uint64_t K512[80] =
|
||||
|
||||
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
|
||||
__m512i T0 = _mm512_add_epi64( v512_64( K512[i] ), W[ i ] ); \
|
||||
__m512i T1 = BSG8W_5_1( E ); \
|
||||
__m512i T2 = BSG8W_5_0( A ); \
|
||||
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
|
||||
@@ -155,14 +527,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
}
|
||||
else
|
||||
{
|
||||
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
A = v512_64( 0x6A09E667F3BCC908 );
|
||||
B = v512_64( 0xBB67AE8584CAA73B );
|
||||
C = v512_64( 0x3C6EF372FE94F82B );
|
||||
D = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
E = v512_64( 0x510E527FADE682D1 );
|
||||
F = v512_64( 0x9B05688C2B3E6C1F );
|
||||
G = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
H = v512_64( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
for ( i = 0; i < 80; i += 8 )
|
||||
@@ -191,14 +563,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
else
|
||||
{
|
||||
ctx->initialized = true;
|
||||
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
|
||||
r[0] = _mm512_add_epi64( A, v512_64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm512_add_epi64( B, v512_64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm512_add_epi64( C, v512_64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm512_add_epi64( D, v512_64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm512_add_epi64( E, v512_64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm512_add_epi64( F, v512_64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm512_add_epi64( G, v512_64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm512_add_epi64( H, v512_64( 0x5BE0CD19137E2179 ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,7 +615,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
@@ -255,9 +627,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||
|
||||
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
|
||||
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
|
||||
v512_64( sc->count >> 61 ), shuff_bswap64 );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
|
||||
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
|
||||
v512_64( sc->count << 3 ), shuff_bswap64 );
|
||||
sha512_8way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm512_block_bswap_64( dst, sc->val );
|
||||
@@ -295,7 +667,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
@@ -317,7 +689,7 @@ do { \
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
@@ -364,14 +736,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
}
|
||||
else
|
||||
{
|
||||
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
A = v256_64( 0x6A09E667F3BCC908 );
|
||||
B = v256_64( 0xBB67AE8584CAA73B );
|
||||
C = v256_64( 0x3C6EF372FE94F82B );
|
||||
D = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
E = v256_64( 0x510E527FADE682D1 );
|
||||
F = v256_64( 0x9B05688C2B3E6C1F );
|
||||
G = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
H = v256_64( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
@@ -405,14 +777,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
else
|
||||
{
|
||||
ctx->initialized = true;
|
||||
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
|
||||
r[0] = _mm256_add_epi64( A, v256_64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, v256_64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, v256_64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, v256_64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, v256_64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, v256_64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, v256_64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, v256_64( 0x5BE0CD19137E2179 ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -457,7 +829,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
@@ -469,9 +841,9 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||
|
||||
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
|
||||
v256_64( sc->count >> 61 ), shuff_bswap64 );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
|
||||
v256_64( sc->count << 3 ), shuff_bswap64 );
|
||||
sha512_4way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm256_block_bswap_64( dst, sc->val );
|
||||
|
@@ -5,11 +5,32 @@
|
||||
#include "simd-utils.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#if defined(__SHA512__) && defined(__AVX2__)
|
||||
|
||||
// Experimental, untested
|
||||
// Need to substitute for sph_sha512
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint64_t buf[128>>3];
|
||||
uint64_t val[8];
|
||||
uint64_t count;
|
||||
} sha512_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in );
|
||||
|
||||
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-512 8 way
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[128>>3];
|
||||
__m512i val[8];
|
||||
uint64_t count;
|
||||
@@ -28,7 +49,8 @@ void sha512_8way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
// SHA-512 4 way
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[128>>3];
|
||||
__m256i val[8];
|
||||
uint64_t count;
|
||||
|
@@ -16,14 +16,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
|
||||
{
|
||||
ctx->count = 0;
|
||||
ctx->initialized = true;
|
||||
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
|
||||
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
|
||||
ctx->val[0] = v512_64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = v512_64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = v512_64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = v512_64( 0x963877195940EABD );
|
||||
ctx->val[4] = v512_64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = v512_64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = v512_64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = v512_64( 0x0EB72DDC81C52CA2 );
|
||||
}
|
||||
|
||||
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -43,7 +43,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );
|
||||
const __m512i eight = v512_64( 0x0000000800000000 );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
@@ -84,14 +84,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
|
||||
{
|
||||
ctx->count = 0;
|
||||
ctx->initialized = true;
|
||||
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
|
||||
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
|
||||
ctx->val[0] = v256_64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = v256_64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = v256_64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = v256_64( 0x963877195940EABD );
|
||||
ctx->val[4] = v256_64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = v256_64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = v256_64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = v256_64( 0x0EB72DDC81C52CA2 );
|
||||
}
|
||||
|
||||
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -111,7 +111,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );
|
||||
const __m256i four = v256_64( 0x0000000400000000 );
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
|
Reference in New Issue
Block a user