mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.0
This commit is contained in:
@@ -308,7 +308,52 @@ static const sph_u32 CS[16] = {
|
||||
/////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SIMD
|
||||
// Only used for the prehash; otherwise the 4-way version is used with SSE2.
|
||||
|
||||
// Optimize shuffles to reduce latency caused by dependencies on V1.
|
||||
// BLAKE256_ROUND( r ): one round over the four __m128i state vectors V0, V1,
// V2 and V3, which must be in scope where the macro is expanded.
//
// The round is made of four steps with the same shape:
//   V0 += V1 + _mm_set_epi32( four CSx( r, i ) ^ Mx( r, j ) terms )
//   V3  = mm128_swap32_16 or mm128_shuflr32_8 applied to ( V3 ^ V0 )
//   V2 += V3
//   V1  = mm128_ror_32( V1 ^ V2, 12 or 7 )
// The first two steps use indices 0-7, the last two use indices 8-F.
// Between the two halves V0, V3 and V2 are shuffled while V1 is left in
// place; after the second half the opposite-direction shuffles are applied
// to V0, V3 and V2.
//
// NOTE(review): CSx, Mx and the mm128_* helpers are defined outside this
// view. Presumably the two halves are the column and diagonal steps of
// Blake-256 and the trailing shuffles restore the original lane order --
// confirm against the helper definitions.
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V0 = mm128_shufll_32( V0 ); \
|
||||
V3 = mm128_swap_64( V3 ); \
|
||||
V2 = mm128_shuflr_32( V2 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V0 = mm128_shuflr_32( V0 ); \
|
||||
V3 = mm128_swap_64( V3 ); \
|
||||
V2 = mm128_shufll_32( V2 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
@@ -350,6 +395,7 @@ static const sph_u32 CS[16] = {
|
||||
V2 = mm128_swap_64( V2 ); \
|
||||
V1 = mm128_shufll_32( V1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 )
|
||||
|
||||
@@ -252,14 +252,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
@@ -419,14 +419,14 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
|
||||
@@ -62,14 +62,14 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
|
||||
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
m128_const1_64( 0x510E527F510E527FULL ) );
|
||||
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
m128_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
@@ -269,21 +269,21 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
m256_const1_64( 0x510E527F510E527FULL ) );
|
||||
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
m256_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
@@ -391,14 +391,14 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
m512_const1_64( 0x510E527F510E527FULL ) );
|
||||
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
m512_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
@@ -589,14 +589,14 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
|
||||
@@ -64,6 +64,22 @@
|
||||
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
|
||||
}
|
||||
|
||||
// Pivoting about V[1] instead of V[0] reduces latency.
|
||||
// BLAKE2B_ROUND( R ): one round selected by row R of `sigma`.
//
// Expects `v` and `sigma` to be in scope where the macro is expanded. It
// views `v` through a local __m256i pointer V and binds sigmaR to sigma[R].
// BLAKE2B_G is invoked twice with different index lists. Between the two
// calls V[0], V[3] and V[2] are shuffled while V[1] is left in place; after
// the second call the opposite-direction shuffles are applied to V[0], V[3]
// and V[2].
//
// NOTE(review): sigmaR is not referenced directly in this macro; presumably
// BLAKE2B_G reads it together with V and the index arguments -- confirm
// against the BLAKE2B_G definition. The trailing shuffles presumably
// restore the original lane order -- confirm against the mm256_* helpers.
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m256i *V = (__m256i*)v; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
|
||||
V[0] = mm256_shufll_64( V[0] ); \
|
||||
V[3] = mm256_swap_128( V[3] ); \
|
||||
V[2] = mm256_shuflr_64( V[2] ); \
|
||||
BLAKE2B_G( 14, 15, 8, 9, 10, 11, 12, 13 ); \
|
||||
V[0] = mm256_shuflr_64( V[0] ); \
|
||||
V[3] = mm256_swap_128( V[3] ); \
|
||||
V[2] = mm256_shufll_64( V[2] ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m256i *V = (__m256i*)v; \
|
||||
@@ -77,6 +93,7 @@
|
||||
V[2] = mm256_swap_128( V[2] ); \
|
||||
V[1] = mm256_shufll_64( V[1] ); \
|
||||
}
|
||||
*/
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
Reference in New Issue
Block a user