Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)
Commit: v3.22.3
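Note: every hunk in this diff applies the same mechanical substitution — the repo-local
constant/broadcast helper macros (m128_const1_64, m256_const1_64, m512_const1_64,
m512_const1_128, m512_const2_64, m256_const_64, m512_const_64, ...) are replaced by the
standard set1/set intrinsics or by the renamed broadcast helpers that appear in the new
lines (mm512_bcast_m128, mm512_bcast128lo_64, mm512_bcast128hi_64, mm256_set2_64, ...).
The standalone sketch below is not part of the commit; it only illustrates, under that
assumption, what the new calls do: splat one scalar constant across every lane of the
vector register.

    /* Illustrative only: compile with -mavx2. Shows that _mm256_set1_epi64x(),
       which replaces m256_const1_64() in the hunks below, broadcasts a 64-bit
       constant into all four lanes of a __m256i. */
    #include <immintrin.h>
    #include <inttypes.h>
    #include <stdio.h>

    int main( void )
    {
        __m256i v = _mm256_set1_epi64x( 0x243F6A88243F6A88ULL );
        uint64_t lane[4];
        _mm256_storeu_si256( (__m256i*)lane, v );
        for ( int i = 0; i < 4; i++ )
            printf( "lane %d: 0x%016" PRIx64 "\n", i, lane[i] );
        return 0;
    }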
@@ -598,10 +598,10 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
VA = m128_const1_64( 0x13198A2E13198A2E ); \
VB = m128_const1_64( 0x0370734403707344 ); \
V8 = _mm_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm_set1_epi64x( 0x0370734403707344 ); \
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -958,7 +958,6 @@ do { \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap32; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -967,16 +966,16 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1034,10 +1033,10 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1100,23 +1099,23 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m256_const1_32( CS0 );
V[ 9] = m256_const1_32( CS1 );
V[10] = m256_const1_32( CS2 );
V[11] = m256_const1_32( CS3 );
V[12] = m256_const1_32( CS4 ^ 0x280 );
V[13] = m256_const1_32( CS5 ^ 0x280 );
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
V[ 8] = _mm256_set1_epi32( CS0 );
V[ 9] = _mm256_set1_epi32( CS1 );
V[10] = _mm256_set1_epi32( CS2 );
V[11] = _mm256_set1_epi32( CS3 );
V[12] = _mm256_set1_epi32( CS4 ^ 0x280 );
V[13] = _mm256_set1_epi32( CS5 ^ 0x280 );
V[14] = _mm256_set1_epi32( CS6 );
V[15] = _mm256_set1_epi32( CS7 );

// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).

M[ 4] = m256_const1_32( 0x80000000 );
M[ 4] = _mm256_set1_epi32( 0x80000000 );
M[13] = m256_one_32;
M[15] = m256_const1_32( 80*8 );
M[15] = _mm256_set1_epi32( 80*8 );

M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );

@@ -1278,8 +1277,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;

const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1615,7 +1613,8 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
__m512i shuf_bswap32; \
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1624,18 +1623,14 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
VB = _mm512_set1_epi64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1693,10 +1688,10 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
VB = _mm512_set1_epi64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1763,23 +1758,23 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m512_const1_32( CS0 );
V[ 9] = m512_const1_32( CS1 );
V[10] = m512_const1_32( CS2 );
V[11] = m512_const1_32( CS3 );
V[12] = m512_const1_32( CS4 ^ 0x280 );
V[13] = m512_const1_32( CS5 ^ 0x280 );
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
V[ 8] = _mm512_set1_epi32( CS0 );
V[ 9] = _mm512_set1_epi32( CS1 );
V[10] = _mm512_set1_epi32( CS2 );
V[11] = _mm512_set1_epi32( CS3 );
V[12] = _mm512_set1_epi32( CS4 ^ 0x280 );
V[13] = _mm512_set1_epi32( CS5 ^ 0x280 );
V[14] = _mm512_set1_epi32( CS6 );
V[15] = _mm512_set1_epi32( CS7 );

// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).

M[ 4] = m512_const1_32( 0x80000000 );
M[ 4] = _mm512_set1_epi32( 0x80000000 );
M[13] = m512_one_32;
M[15] = m512_const1_32( 80*8 );
M[15] = _mm512_set1_epi32( 80*8 );

M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );

@@ -1956,10 +1951,8 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,

// Byte swap final hash
const __m512i shuf_bswap32 =
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1981,14 +1974,14 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
casti_m128i( ctx->H, 0 ) = _mm_set1_epi64x( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi64x( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi64x( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -2059,13 +2052,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
else
ctx->T0 -= 512 - bit_len;

buf[vptr] = m128_const1_64( 0x0000008000000080 );
buf[vptr] = _mm_set1_epi64x( 0x0000008000000080 );

if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
_mm_set1_epi64x( 0x0100000001000000ULL ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -2078,7 +2071,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
_mm_set1_epi64x( 0x0100000001000000ULL ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
@@ -2097,14 +2090,14 @@ static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527F510E527F );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -2163,7 +2156,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
buf[ptr>>2] = _mm256_set1_epi64x( 0x0000008000000080ULL );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -2185,7 +2178,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
m256_const1_64( 0x0100000001000000ULL ) );
_mm256_set1_epi64x( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2198,7 +2191,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf, 64 );
@@ -2259,7 +2252,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_32( 0x80000000 );
buf[ptr>>2] = _mm256_set1_epi32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -2312,14 +2305,14 @@ static void
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527F510E527F );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C9B05688C );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -2376,7 +2369,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
buf[ptr>>2] = _mm512_set1_epi64( 0x0000008000000080ULL );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -2398,7 +2391,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm512_or_si512( buf[52>>2],
m512_const1_64( 0x0100000001000000ULL ) );
_mm512_set1_epi64( 0x0100000001000000ULL ) );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2411,7 +2404,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[52>>2] = _mm512_set1_epi64( 0x0100000001000000ULL );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf, 64 );
@@ -2473,7 +2466,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_32( 0x80000000 );
buf[ptr>>2] = _mm512_set1_epi32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -350,7 +350,6 @@ static const sph_u64 CB[16] = {
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
__m512i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -359,18 +358,16 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( CB0 ); \
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
V8 = _mm512_set1_epi64( CB0 ); \
V9 = _mm512_set1_epi64( CB1 ); \
VA = _mm512_set1_epi64( CB2 ); \
VB = _mm512_set1_epi64( CB3 ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -419,7 +416,6 @@ void blake512_8way_compress( blake_8way_big_context *sc )
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i shuf_bswap64;

V0 = sc->H[0];
V1 = sc->H[1];
@@ -429,19 +425,17 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );

shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -503,10 +497,10 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
@@ -565,23 +559,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
__m512i V8, V9, VA, VB, VC, VD, VE, VF;

// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
sc->buf[15] = _mm512_set1_epi64( 80*8 );

// build working variables
V0 = sc->H[0];
@@ -592,10 +586,10 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
@@ -790,14 +784,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,

void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -861,7 +855,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = m512_const1_64( 0x80 );
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -882,9 +876,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
{
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm512_or_si512( buf[104>>3],
m512_const1_64( 0x0100000000000000ULL ) );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
_mm512_set1_epi64( 0x0100000000000000ULL ) );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );

blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -896,9 +890,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_512( buf, 112>>3 );
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );

blake64_8way( sc, buf, 128 );
}
@@ -912,14 +906,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,

// init

casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -943,7 +937,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;

bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x80 );
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -961,9 +955,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;

memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m512_const1_64( bswap_64( th ) );
sc->buf[15] = m512_const1_64( bswap_64( tl ) );
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );

if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -979,14 +973,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,

// init

casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1010,7 +1004,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;

bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;

@@ -1029,8 +1023,8 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,

memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
sc->buf[14] = _mm512_set1_epi64( th );
sc->buf[15] = _mm512_set1_epi64( tl );

if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1092,7 +1086,6 @@ blake512_8way_close(void *cc, void *dst)
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1101,16 +1094,16 @@ blake512_8way_close(void *cc, void *dst)
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( CB0 ); \
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
V8 = _mm256_set1_epi64x( CB0 ); \
V9 = _mm256_set1_epi64x( CB1 ); \
VA = _mm256_set1_epi64x( CB2 ); \
VB = _mm256_set1_epi64x( CB3 ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -1160,7 +1153,6 @@ void blake512_4way_compress( blake_4way_big_context *sc )
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i shuf_bswap64;

V0 = sc->H[0];
V1 = sc->H[1];
@@ -1170,20 +1162,20 @@ void blake512_4way_compress( blake_4way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB4 ) );
_mm256_set1_epi64x( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB5 ) );
_mm256_set1_epi64x( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB6 ) );
_mm256_set1_epi64x( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB7 ) );
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
_mm256_set1_epi64x( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -1236,23 +1228,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
__m256i V8, V9, VA, VB, VC, VD, VE, VF;

// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
sc->buf[15] = _mm256_set1_epi64x( 80*8 );

// build working variables
V0 = sc->H[0];
@@ -1263,10 +1255,10 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
@@ -1446,14 +1438,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,

void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1513,7 +1505,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = m256_const1_64( 0x80 );
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1535,9 +1527,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm256_or_si256( buf[104>>3],
m256_const1_64( 0x0100000000000000ULL ) );
buf[112>>3] = m256_const1_64( bswap_64( th ) );
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );

blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -1549,9 +1541,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m256_const1_64( bswap_64( th ) );
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );

blake64_4way( sc, buf, 128 );
}
@@ -1565,14 +1557,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,

// init

casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1596,7 +1588,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
uint64_t th, tl;

bit_len = sc->ptr << 3;
sc->buf[ptr64] = m256_const1_64( 0x80 );
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
@@ -1613,9 +1605,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;

memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m256_const1_64( bswap_64( th ) );
sc->buf[15] = m256_const1_64( bswap_64( tl ) );
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );

if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;

@@ -221,14 +221,14 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;

h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = mm512_bcast_m128( iv[0] );
h[ 1] = mm512_bcast_m128( iv[1] );
h[ 2] = mm512_bcast_m128( iv[2] );
h[ 3] = mm512_bcast_m128( iv[3] );
h[ 4] = mm512_bcast_m128( iv[4] );
h[ 5] = mm512_bcast_m128( iv[5] );
h[ 6] = mm512_bcast_m128( iv[6] );
h[ 7] = mm512_bcast_m128( iv[7] );

return 0;
}
@@ -259,11 +259,11 @@ int cube_4way_close( cube_4way_context *sp, void *output )

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
mm512_bcast128lo_64( 0x0000000000000080 ) );
transform_4way( sp );

sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
mm512_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i )
transform_4way( sp );
@@ -283,14 +283,14 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
sp->rounds = 16;
sp->pos = 0;

h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = mm512_bcast_m128( iv[0] );
h[ 1] = mm512_bcast_m128( iv[1] );
h[ 2] = mm512_bcast_m128( iv[2] );
h[ 3] = mm512_bcast_m128( iv[3] );
h[ 4] = mm512_bcast_m128( iv[4] );
h[ 5] = mm512_bcast_m128( iv[5] );
h[ 6] = mm512_bcast_m128( iv[6] );
h[ 7] = mm512_bcast_m128( iv[7] );

const int len = size >> 4;
const __m512i *in = (__m512i*)data;
@@ -310,11 +310,11 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
mm512_bcast128lo_64( 0x0000000000000080 ) );
transform_4way( sp );

sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
mm512_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i )
transform_4way( sp );
@@ -336,14 +336,14 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
sp->rounds = 16;
sp->pos = 0;

h1[0] = h0[0] = m512_const1_128( iv[0] );
h1[1] = h0[1] = m512_const1_128( iv[1] );
h1[2] = h0[2] = m512_const1_128( iv[2] );
h1[3] = h0[3] = m512_const1_128( iv[3] );
h1[4] = h0[4] = m512_const1_128( iv[4] );
h1[5] = h0[5] = m512_const1_128( iv[5] );
h1[6] = h0[6] = m512_const1_128( iv[6] );
h1[7] = h0[7] = m512_const1_128( iv[7] );
h1[0] = h0[0] = mm512_bcast_m128( iv[0] );
h1[1] = h0[1] = mm512_bcast_m128( iv[1] );
h1[2] = h0[2] = mm512_bcast_m128( iv[2] );
h1[3] = h0[3] = mm512_bcast_m128( iv[3] );
h1[4] = h0[4] = mm512_bcast_m128( iv[4] );
h1[5] = h0[5] = mm512_bcast_m128( iv[5] );
h1[6] = h0[6] = mm512_bcast_m128( iv[6] );
h1[7] = h0[7] = mm512_bcast_m128( iv[7] );

const int len = size >> 4;
const __m512i *in0 = (__m512i*)data0;
@@ -365,13 +365,13 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
}

// pos is zero for 64 byte data, 1 for 80 byte data.
__m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
__m512i tmp = mm512_bcast128lo_64( 0x0000000000000080 );
sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );

transform_4way_2buf( sp );

tmp = m512_const2_64( 0x0000000100000000, 0 );
tmp = mm512_bcast128hi_64( 0x0000000100000000 );
sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );

@@ -384,7 +384,6 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
return 0;
}


int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
{
@@ -406,11 +405,11 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
mm512_bcast128lo_64( 0x0000000000000080 ) );
transform_4way( sp );

sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
mm512_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i )
transform_4way( sp );
@@ -508,14 +507,14 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;

h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = mm256_bcast_m128( iv[0] );
h[ 1] = mm256_bcast_m128( iv[1] );
h[ 2] = mm256_bcast_m128( iv[2] );
h[ 3] = mm256_bcast_m128( iv[3] );
h[ 4] = mm256_bcast_m128( iv[4] );
h[ 5] = mm256_bcast_m128( iv[5] );
h[ 6] = mm256_bcast_m128( iv[6] );
h[ 7] = mm256_bcast_m128( iv[7] );

return 0;
}
@@ -546,13 +545,14 @@ int cube_2way_close( cube_2way_context *sp, void *output )

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
mm256_bcast128lo_64( 0x0000000000000080 ) );
transform_2way( sp );

sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
mm256_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < 10; ++i )
transform_2way( sp );

memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
@@ -579,13 +579,14 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
mm256_bcast128lo_64( 0x0000000000000080 ) );
transform_2way( sp );

sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
mm256_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < 10; ++i )
transform_2way( sp );

memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
@@ -602,14 +603,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
sp->rounds = 16;
sp->pos = 0;

h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = mm256_bcast_m128( iv[0] );
h[ 1] = mm256_bcast_m128( iv[1] );
h[ 2] = mm256_bcast_m128( iv[2] );
h[ 3] = mm256_bcast_m128( iv[3] );
h[ 4] = mm256_bcast_m128( iv[4] );
h[ 5] = mm256_bcast_m128( iv[5] );
h[ 6] = mm256_bcast_m128( iv[6] );
h[ 7] = mm256_bcast_m128( iv[7] );

const int len = size >> 4;
const __m256i *in = (__m256i*)data;
@@ -629,13 +630,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
mm256_bcast128lo_64( 0x0000000000000080 ) );
transform_2way( sp );

sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
mm256_bcast128hi_64( 0x0000000100000000 ) );

for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < 10; ++i )
transform_2way( sp );

memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;

@@ -162,9 +162,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
unsigned int r, b, i, j;
__m512i t1, t2, s2, k1;
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
__m512i one = m512_one_128;
__m512i mul2mask = m512_const2_64( 0, 0x00001b00 );
__m512i lsbmask = m512_const1_32( 0x01010101 );
const __m512i one = mm512_bcast128lo_64( 1 );
const __m512i mul2mask = mm512_bcast128lo_64( 0x00001b00 );
const __m512i lsbmask = _mm512_set1_epi32( 0x01010101 );

_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
@@ -264,16 +264,16 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize )
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m512_const2_64( 0, 0x100 );
ctx->const1536 = m512_const2_64( 0, 0x600 );
ctx->hashsize = mm512_bcast128lo_64( 0x100 );
ctx->const1536 = mm512_bcast128lo_64( 0x600 );
break;

case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m512_const2_64( 0, 0x200 );
ctx->const1536 = m512_const2_64( 0, 0x400);
ctx->hashsize = mm512_bcast128lo_64( 0x200 );
ctx->const1536 = mm512_bcast128lo_64( 0x400);
break;

default:
@@ -305,7 +305,7 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
{
echo_4way_compress( state, data, 1 );
state->processed_bits = 1024;
remainingbits = m512_const2_64( 0, -1024 );
remainingbits = mm512_bcast128lo_64( -1024 );
vlen = 0;
}
else
@@ -313,13 +313,15 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen );
remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
remainingbits = mm512_bcast128lo_64( (uint64_t)databitlen );
}

state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
state->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
state->buffer[ vblen-2 ] =
mm512_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
state->buffer[ vblen-1 ] =
mm512_bcast128lo_64( state->processed_bits );

state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
@@ -352,16 +354,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m512_const2_64( 0, 0x100 );
ctx->const1536 = m512_const2_64( 0, 0x600 );
ctx->hashsize = mm512_bcast128lo_64( 0x100 );
ctx->const1536 = mm512_bcast128lo_64( 0x600 );
break;

case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m512_const2_64( 0, 0x200 );
ctx->const1536 = m512_const2_64( 0, 0x400 );
ctx->hashsize = mm512_bcast128lo_64( 0x200 );
ctx->const1536 = mm512_bcast128lo_64( 0x400 );
break;

default:
@@ -388,7 +390,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
{
echo_4way_compress( ctx, data, 1 );
ctx->processed_bits = 1024;
remainingbits = m512_const2_64( 0, -1024 );
remainingbits = mm512_bcast128lo_64( -1024 );
vlen = 0;
}
else
@@ -396,14 +398,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( ctx->buffer, data, vlen );
ctx->processed_bits += (unsigned int)( databitlen );
remainingbits = m512_const2_64( 0, databitlen );
remainingbits = mm512_bcast128lo_64( databitlen );
}

ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
ctx->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
ctx->buffer[ vblen-2 ] =
m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
mm512_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
ctx->buffer[ vblen-1 ] = mm512_bcast128lo_64( ctx->processed_bits);

ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
@@ -425,9 +427,9 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,

// AVX2 + VAES

#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 )
#define mul2mask_2way mm256_bcast128lo_64( 0x0000000000001b00 )

#define lsbmask_2way m256_const1_32( 0x01010101 )
#define lsbmask_2way _mm256_set1_epi32( 0x01010101 )

#define ECHO_SUBBYTES4_2WAY( state, j ) \
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
@@ -679,16 +681,16 @@ int echo_2way_init( echo_2way_context *ctx, int nHashSize )
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m256_const2_64( 0, 0x100 );
ctx->const1536 = m256_const2_64( 0, 0x600 );
ctx->hashsize = mm256_bcast128lo_64( 0x100 );
ctx->const1536 = mm256_bcast128lo_64( 0x600 );
break;

case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m256_const2_64( 0, 0x200 );
ctx->const1536 = m256_const2_64( 0, 0x400 );
ctx->hashsize = mm256_bcast128lo_64( 0x200 );
ctx->const1536 = mm256_bcast128lo_64( 0x400 );
break;

default:
@@ -720,20 +722,20 @@ int echo_2way_update_close( echo_2way_context *state, void *hashval,
{
echo_2way_compress( state, data, 1 );
state->processed_bits = 1024;
remainingbits = m256_const2_64( 0, -1024 );
remainingbits = mm256_bcast128lo_64( -1024 );
vlen = 0;
}
else
{
memcpy_256( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen );
remainingbits = m256_const2_64( 0, databitlen );
remainingbits = mm256_bcast128lo_64( databitlen );
}

state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
state->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
state->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
state->buffer[ vblen-1 ] = mm256_bcast128lo_64( state->processed_bits );

state->k = _mm256_add_epi64( state->k, remainingbits );
state->k = _mm256_sub_epi64( state->k, state->const1536 );
@@ -766,16 +768,16 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x100 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
ctx->hashsize = mm256_bcast128lo_64( 0x200 );
|
||||
ctx->const1536 = mm256_bcast128lo_64( 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -798,7 +800,7 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
{
|
||||
echo_2way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
remainingbits = mm256_bcast128lo_64( -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
@@ -806,13 +808,13 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_256( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
remainingbits = mm256_bcast128lo_64( databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
|
||||
memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
|
||||
ctx->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
|
||||
ctx->buffer[ vblen-1 ] = mm256_bcast128lo_64( ctx->processed_bits );
|
||||
|
||||
ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
@@ -33,8 +33,7 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -51,9 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
// return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
@@ -61,7 +57,7 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
@@ -83,18 +79,18 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -140,18 +136,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -186,7 +182,7 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
@@ -211,7 +207,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
@@ -233,18 +229,18 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -289,23 +285,22 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_2way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_2way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
|
||||
@@ -165,7 +165,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
@@ -205,116 +205,18 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
|
||||
#if 0
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
#endif
|
||||
#define MASK_NOT( a ) _mm512_mask_ternarylogic_epi64( a, 0xaa, a, a, 1 )
|
||||
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
|
||||
a1 = _mm512_xor_si512( a1, b1 );\
|
||||
a2 = _mm512_xor_si512( a2, b1 );\
|
||||
a3 = _mm512_xor_si512( a3, b1 );\
|
||||
a4 = _mm512_xor_si512( a4, b1 );\
|
||||
a5 = _mm512_xor_si512( a5, b1 );\
|
||||
a6 = _mm512_xor_si512( a6, b1 );\
|
||||
a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
|
||||
a0 = _mm512_xor_si512( a0, mm512_bcast_m128( round_const_l0[i] ) );\
|
||||
a1 = MASK_NOT( a1 ); \
|
||||
a2 = MASK_NOT( a2 ); \
|
||||
a3 = MASK_NOT( a3 ); \
|
||||
a4 = MASK_NOT( a4 ); \
|
||||
a5 = MASK_NOT( a5 ); \
|
||||
a6 = MASK_NOT( a6 ); \
|
||||
a7 = _mm512_xor_si512( a7, mm512_bcast_m128( round_const_l7[i] ) );\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm512_xor_si512( b0, b0 );\
|
||||
@@ -450,7 +352,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm512_xor_si512( t0, t0 );\
|
||||
t0 = m512_zero;\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
@@ -481,11 +383,11 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
|
||||
void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
@@ -547,11 +449,11 @@ void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
|
||||
void OF512_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -696,7 +598,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
@@ -738,15 +640,15 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\
|
||||
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
|
||||
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
|
||||
a1 = _mm256_xor_si256( a1, b1 );\
|
||||
a2 = _mm256_xor_si256( a2, b1 );\
|
||||
a3 = _mm256_xor_si256( a3, b1 );\
|
||||
a4 = _mm256_xor_si256( a4, b1 );\
|
||||
a5 = _mm256_xor_si256( a5, b1 );\
|
||||
a6 = _mm256_xor_si256( a6, b1 );\
|
||||
a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\
|
||||
a7 = _mm256_xor_si256( a7, mm256_bcast_m128( round_const_l7[i] ) );\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm256_xor_si256( b0, b0 );\
|
||||
@@ -850,7 +752,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm256_xor_si256( t0, t0 );\
|
||||
t0 = m256_zero;\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
@@ -874,11 +776,11 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
void TF512_2way( __m256i* chaining, __m256i* message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
@@ -940,11 +842,11 @@ void TF512_2way( __m256i* chaining, __m256i* message )
|
||||
|
||||
void OF512_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
|
||||
@@ -25,8 +25,7 @@ int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -61,14 +60,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
@@ -94,7 +93,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
@@ -113,14 +112,14 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m512_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
@@ -143,7 +142,7 @@ int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
@@ -179,14 +178,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
@@ -212,7 +211,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
@@ -231,14 +230,14 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
|
||||
@@ -174,7 +174,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
@@ -238,7 +238,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
|
||||
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
@@ -253,7 +253,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
|
||||
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
|
||||
@@ -282,7 +282,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
|
||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||
xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
|
||||
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
|
||||
@@ -305,7 +305,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||
xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
|
||||
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
|
||||
@@ -471,8 +471,8 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
|
||||
void INIT_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -500,12 +500,12 @@ void INIT_4way( __m512i* chaining )
|
||||
|
||||
void TF1024_4way( __m512i* chaining, const __m512i* message )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i QTEMP[8];
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i QTEMP[8];
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
@@ -606,11 +606,11 @@ void TF1024_4way( __m512i* chaining, const __m512i* message )
|
||||
|
||||
void OF1024_4way( __m512i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
__m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m512i TEMP0;
|
||||
__m512i TEMP1;
|
||||
__m512i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -758,7 +758,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
@@ -822,7 +822,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \
|
||||
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
|
||||
@@ -837,7 +837,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \
|
||||
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
|
||||
@@ -866,7 +866,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
xmm12 = _mm256_xor_si256( xmm12, xmm1 );\
|
||||
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
|
||||
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
|
||||
xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \
|
||||
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
|
||||
@@ -889,7 +889,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
xmm4 = _mm256_xor_si256( xmm4, xmm9 );\
|
||||
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
|
||||
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
|
||||
xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \
|
||||
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
|
||||
@@ -1040,8 +1040,8 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
void INIT_2way( __m256i *chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -1069,12 +1069,12 @@ void INIT_2way( __m256i *chaining )
|
||||
|
||||
void TF1024_2way( __m256i *chaining, const __m256i *message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i QTEMP[8];
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i QTEMP[8];
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
@@ -1175,11 +1175,11 @@ void TF1024_2way( __m256i *chaining, const __m256i *message )
|
||||
|
||||
void OF1024_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
__m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
__m256i TEMP0;
|
||||
__m256i TEMP1;
|
||||
__m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
|
||||
@@ -60,7 +60,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT4W( a, b, c0, c1 ) \
|
||||
a = _mm512_xor_si512( a, c0 ); \
|
||||
@@ -154,11 +154,10 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
|
||||
{
|
||||
__m512i t0, t1;
|
||||
__m512i *chainv = state->chainv;
|
||||
__m512i msg0, msg1;
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
@@ -168,9 +167,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
|
||||
MULT24W( t0, t1 );
|
||||
|
||||
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
chainv[0] = _mm512_xor_si512( chainv[0], t0 );
|
||||
chainv[1] = _mm512_xor_si512( chainv[1], t1 );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], t0 );
|
||||
@@ -225,27 +221,36 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
||||
|
||||
MULT24W( chainv[0], chainv[1] );
|
||||
chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
|
||||
chainv[1] = mm512_xor3( chainv[1], t1, msg1 );
|
||||
chainv[0] = _mm512_xor_si512( chainv[0], t0 );
|
||||
chainv[1] = _mm512_xor_si512( chainv[1], t1 );
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
||||
if ( msg )
|
||||
{
|
||||
__m512i msg0, msg1;
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||
chainv[0] = _mm512_xor_si512( chainv[0], msg0 );
|
||||
chainv[1] = _mm512_xor_si512( chainv[1], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1);
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1);
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
|
||||
}
|
||||
|
||||
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
||||
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||
@@ -282,16 +287,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
uint32_t hash[8*4] __attribute((aligned(128)));
|
||||
__m512i* chainv = state->chainv;
|
||||
__m512i t[2];
|
||||
__m512i zero[2];
|
||||
zero[0] = zero[1] = m512_zero;
|
||||
const __m512i shuff_bswap32 = m512_const_64(
|
||||
0x3c3d3e3f38393a3b, 0x3435363730313233,
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_4way( state, zero );
|
||||
rnd512_4way( state, NULL );
|
||||
|
||||
t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
@@ -300,37 +300,30 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
t[0] = _mm512_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||
|
||||
casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 1 ), shuff_bswap32 );
|
||||
casti_m512i( b,0 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash,0 ), shuff_bswap32 );
|
||||
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash,1 ), shuff_bswap32 );
|
||||
|
||||
rnd512_4way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[2] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[3] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[4] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[5] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[6] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[7] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[8] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[9] );
|
||||
rnd512_4way( state, NULL );
|
||||
|
||||
t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
|
||||
t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );
|
||||
t[0] = _mm512_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||
|
||||
casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 1 ), shuff_bswap32 );
|
||||
casti_m512i( b,2 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash,0 ), shuff_bswap32 );
|
||||
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash,1 ), shuff_bswap32 );
|
||||
}
|
||||
|
||||
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
|
||||
@@ -338,16 +331,16 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
|
||||
state->hashbitlen = hashbitlen;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m512_const1_128( iv[0] );
|
||||
state->chainv[1] = m512_const1_128( iv[1] );
|
||||
state->chainv[2] = m512_const1_128( iv[2] );
|
||||
state->chainv[3] = m512_const1_128( iv[3] );
|
||||
state->chainv[4] = m512_const1_128( iv[4] );
|
||||
state->chainv[5] = m512_const1_128( iv[5] );
|
||||
state->chainv[6] = m512_const1_128( iv[6] );
|
||||
state->chainv[7] = m512_const1_128( iv[7] );
|
||||
state->chainv[8] = m512_const1_128( iv[8] );
|
||||
state->chainv[9] = m512_const1_128( iv[9] );
|
||||
state->chainv[0] = mm512_bcast_m128( iv[0] );
|
||||
state->chainv[1] = mm512_bcast_m128( iv[1] );
|
||||
state->chainv[2] = mm512_bcast_m128( iv[2] );
|
||||
state->chainv[3] = mm512_bcast_m128( iv[3] );
|
||||
state->chainv[4] = mm512_bcast_m128( iv[4] );
|
||||
state->chainv[5] = mm512_bcast_m128( iv[5] );
|
||||
state->chainv[6] = mm512_bcast_m128( iv[6] );
|
||||
state->chainv[7] = mm512_bcast_m128( iv[7] );
|
||||
state->chainv[8] = mm512_bcast_m128( iv[8] );
|
||||
state->chainv[9] = mm512_bcast_m128( iv[9] );
|
||||
|
||||
((__m512i*)state->buffer)[0] = m512_zero;
|
||||
((__m512i*)state->buffer)[1] = m512_zero;
|
||||
@@ -370,11 +363,8 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
__m512i msg[2];
|
||||
int i;
|
||||
int blocks = (int)len >> 5;
|
||||
const __m512i shuff_bswap32 = m512_const_64(
|
||||
0x3c3d3e3f38393a3b, 0x3435363730313233,
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
state->rembytes = (int)len & 0x1F;
|
||||
|
||||
@@ -392,7 +382,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -416,7 +406,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
|
||||
rnd512_4way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -440,16 +430,16 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
||||
state->hashbitlen = 512;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m512_const1_128( iv[0] );
|
||||
state->chainv[1] = m512_const1_128( iv[1] );
|
||||
state->chainv[2] = m512_const1_128( iv[2] );
|
||||
state->chainv[3] = m512_const1_128( iv[3] );
|
||||
state->chainv[4] = m512_const1_128( iv[4] );
|
||||
state->chainv[5] = m512_const1_128( iv[5] );
|
||||
state->chainv[6] = m512_const1_128( iv[6] );
|
||||
state->chainv[7] = m512_const1_128( iv[7] );
|
||||
state->chainv[8] = m512_const1_128( iv[8] );
|
||||
state->chainv[9] = m512_const1_128( iv[9] );
|
||||
state->chainv[0] = mm512_bcast_m128( iv[0] );
|
||||
state->chainv[1] = mm512_bcast_m128( iv[1] );
|
||||
state->chainv[2] = mm512_bcast_m128( iv[2] );
|
||||
state->chainv[3] = mm512_bcast_m128( iv[3] );
|
||||
state->chainv[4] = mm512_bcast_m128( iv[4] );
|
||||
state->chainv[5] = mm512_bcast_m128( iv[5] );
|
||||
state->chainv[6] = mm512_bcast_m128( iv[6] );
|
||||
state->chainv[7] = mm512_bcast_m128( iv[7] );
|
||||
state->chainv[8] = mm512_bcast_m128( iv[8] );
|
||||
state->chainv[9] = mm512_bcast_m128( iv[9] );
|
||||
|
||||
((__m512i*)state->buffer)[0] = m512_zero;
|
||||
((__m512i*)state->buffer)[1] = m512_zero;
|
||||
@@ -458,11 +448,8 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
||||
__m512i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m512i shuff_bswap32 = m512_const_64(
|
||||
0x3c3d3e3f38393a3b, 0x3435363730313233,
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
|
||||
@@ -479,13 +466,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -506,11 +493,8 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
__m512i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m512i shuff_bswap32 = m512_const_64(
|
||||
0x3c3d3e3f38393a3b, 0x3435363730313233,
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
|
||||
@@ -527,13 +511,13 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -548,7 +532,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
#define cns(i) mm256_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT( a, b, c0, c1 ) \
|
||||
a = _mm256_xor_si256( a, c0 ); \
|
||||
@@ -666,11 +650,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
/* Round function */
|
||||
/* state: hash context */
|
||||
|
||||
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
|
||||
{
|
||||
__m256i t0, t1;
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t0 = chainv[0];
|
||||
@@ -687,9 +670,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
|
||||
MULT2( t0, t1 );
|
||||
|
||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
|
||||
@@ -744,26 +724,35 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
|
||||
|
||||
MULT2( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
||||
if ( msg )
|
||||
{
|
||||
__m256i msg0, msg1;
|
||||
|
||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], msg0 );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], msg1 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
||||
|
||||
MULT2( msg0, msg1 );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
||||
}
|
||||
|
||||
chainv[3] = mm256_rol_32( chainv[3], 1 );
|
||||
chainv[5] = mm256_rol_32( chainv[5], 2 );
|
||||
@@ -806,14 +795,10 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
uint32 hash[8*2] __attribute((aligned(64)));
|
||||
__m256i* chainv = state->chainv;
|
||||
__m256i t[2];
|
||||
__m256i zero[2];
|
||||
zero[0] = zero[1] = m256_zero;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_2way( state, zero );
|
||||
rnd512_2way( state, NULL );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
@@ -838,7 +823,7 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
||||
|
||||
rnd512_2way( state, zero );
|
||||
rnd512_2way( state, NULL );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
@@ -868,16 +853,16 @@ int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
|
||||
state->hashbitlen = hashbitlen;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m256_const1_128( iv[0] );
|
||||
state->chainv[1] = m256_const1_128( iv[1] );
|
||||
state->chainv[2] = m256_const1_128( iv[2] );
|
||||
state->chainv[3] = m256_const1_128( iv[3] );
|
||||
state->chainv[4] = m256_const1_128( iv[4] );
|
||||
state->chainv[5] = m256_const1_128( iv[5] );
|
||||
state->chainv[6] = m256_const1_128( iv[6] );
|
||||
state->chainv[7] = m256_const1_128( iv[7] );
|
||||
state->chainv[8] = m256_const1_128( iv[8] );
|
||||
state->chainv[9] = m256_const1_128( iv[9] );
|
||||
state->chainv[0] = mm256_bcast_m128( iv[0] );
|
||||
state->chainv[1] = mm256_bcast_m128( iv[1] );
|
||||
state->chainv[2] = mm256_bcast_m128( iv[2] );
|
||||
state->chainv[3] = mm256_bcast_m128( iv[3] );
|
||||
state->chainv[4] = mm256_bcast_m128( iv[4] );
|
||||
state->chainv[5] = mm256_bcast_m128( iv[5] );
|
||||
state->chainv[6] = mm256_bcast_m128( iv[6] );
|
||||
state->chainv[7] = mm256_bcast_m128( iv[7] );
|
||||
state->chainv[8] = mm256_bcast_m128( iv[8] );
|
||||
state->chainv[9] = mm256_bcast_m128( iv[9] );
|
||||
|
||||
((__m256i*)state->buffer)[0] = m256_zero;
|
||||
((__m256i*)state->buffer)[1] = m256_zero;
|
||||
@@ -895,9 +880,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
int blocks = (int)len >> 5;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
state-> rembytes = (int)len & 0x1F;
|
||||
|
||||
@@ -915,7 +898,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m256_const1_i128( 0x0000000080000000 );
|
||||
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -931,7 +914,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
|
||||
rnd512_2way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
@@ -948,16 +931,16 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
||||
state->hashbitlen = 512;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m256_const1_128( iv[0] );
|
||||
state->chainv[1] = m256_const1_128( iv[1] );
|
||||
state->chainv[2] = m256_const1_128( iv[2] );
|
||||
state->chainv[3] = m256_const1_128( iv[3] );
|
||||
state->chainv[4] = m256_const1_128( iv[4] );
|
||||
state->chainv[5] = m256_const1_128( iv[5] );
|
||||
state->chainv[6] = m256_const1_128( iv[6] );
|
||||
state->chainv[7] = m256_const1_128( iv[7] );
|
||||
state->chainv[8] = m256_const1_128( iv[8] );
|
||||
state->chainv[9] = m256_const1_128( iv[9] );
|
||||
state->chainv[0] = mm256_bcast_m128( iv[0] );
|
||||
state->chainv[1] = mm256_bcast_m128( iv[1] );
|
||||
state->chainv[2] = mm256_bcast_m128( iv[2] );
|
||||
state->chainv[3] = mm256_bcast_m128( iv[3] );
|
||||
state->chainv[4] = mm256_bcast_m128( iv[4] );
|
||||
state->chainv[5] = mm256_bcast_m128( iv[5] );
|
||||
state->chainv[6] = mm256_bcast_m128( iv[6] );
|
||||
state->chainv[7] = mm256_bcast_m128( iv[7] );
|
||||
state->chainv[8] = mm256_bcast_m128( iv[8] );
|
||||
state->chainv[9] = mm256_bcast_m128( iv[9] );
|
||||
|
||||
((__m256i*)state->buffer)[0] = m256_zero;
|
||||
((__m256i*)state->buffer)[1] = m256_zero;
|
||||
@@ -966,9 +949,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
@@ -986,13 +967,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
@@ -1013,9 +994,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );

state->rembytes = inlen & 0x1F;
@@ -1033,13 +1012,13 @@ int luffa_2way_update_close( luffa_2way_context *state,
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m256_const1_i128( 0x0000000080000000 );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = m256_const1_i128( 0x0000000080000000 );
msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}

@@ -354,11 +354,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_i128( 0x80000000 ),
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );

finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
@@ -403,11 +403,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_i128( 0x80000000 ),
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );

finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
@@ -596,10 +596,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );

rnd512( state, zero, zero );

@@ -85,10 +85,10 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,

state0 =
state1 = m512_zero;
state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state2 = _mm512_set4_epi64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = _mm512_set4_epi64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
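_mm512_set4_epi64 repeats its four 64-bit arguments across the 512-bit register, lowest argument in the lowest lane, which is what the retired m512_const4_64 helper appears to have done; the four values here are BLAKE2b IV words. A small self-contained check, assuming an AVX-512F build (the code below is illustrative only):

// Sketch: verify the lane layout produced by _mm512_set4_epi64.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const __m512i v = _mm512_set4_epi64( 0xa54ff53a5f1d36f1ULL,
                                        0x3c6ef372fe94f82bULL,
                                        0xbb67ae8584caa73bULL,
                                        0x6a09e667f3bcc908ULL );
   uint64_t q[8];
   _mm512_storeu_si512( q, v );
   // q[0..3] and q[4..7] both hold { 6a09..., bb67..., 3c6e..., a54f... }.
   for ( int i = 0; i < 8; i++ )
      printf( "%016llx\n", (unsigned long long)q[i] );
   return 0;
}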

for ( int i = 0; i < nBlocks; i++ )
{

@@ -41,17 +41,17 @@
inline void initState( uint64_t State[/*16*/] )
{

/*
/*
#if defined (__AVX2__)

__m256i* state = (__m256i*)State;
const __m256i zero = m256_zero;
state[0] = zero;
state[1] = zero;
state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[2] = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[3] = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

#elif defined (__SSE2__)

@@ -271,10 +271,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,

state0 =
state1 = m256_zero;
state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state2 = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

for ( int i = 0; i < nBlocks; i++ )
{

@@ -36,31 +36,31 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );

for ( int i = 0; i < 19; i++ )
vdata[i] = mm512_bcast_i32( pdata[i] );
vdata[i] = _mm512_set1_epi32( pdata[i] );

*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = mm512_bcast_i32( 0x480 );
vdata[16+15] = _mm512_set1_epi32( 0x480 );

block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = mm512_bcast_i32( 0x300 );
block[15] = _mm512_set1_epi32( 0x300 );

initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
initstate[0] = _mm512_set1_epi64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm512_set1_epi64( 0xb72074d4b72074d4 );
initstate[2] = _mm512_set1_epi64( 0x6bb011226bb01122 );
initstate[3] = _mm512_set1_epi64( 0xd338e869d338e869 );
initstate[4] = _mm512_set1_epi64( 0xaa3ff126aa3ff126 );
initstate[5] = _mm512_set1_epi64( 0x475bbf30475bbf30 );
initstate[6] = _mm512_set1_epi64( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm512_set1_epi64( 0x9f75c9ad9f75c9ad );

sha256_16way_transform_le( midstate1, vdata, initstate );
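The pattern throughout this hunk is the same as in the rest of the commit: project-specific broadcast helpers such as mm512_bcast_i32 / mm512_bcast_i64 are swapped for the standard set1 intrinsics, which splat one scalar into every lane. A minimal sketch of that substitution using only standard AVX-512F intrinsics (splat32 and vectorize_header are illustrative names, not functions from this tree):

// Broadcast one 32-bit scalar into all sixteen lanes of a 512-bit vector;
// apparently equivalent, for this commit's purposes, to mm512_bcast_i32.
#include <immintrin.h>
#include <stdint.h>

static inline __m512i splat32( uint32_t x )
{
   return _mm512_set1_epi32( (int)x );
}

// Example: present the 19 constant words of an 80-byte block header
// identically to every SIMD lane, as the 16-way scanhash loop above does.
static void vectorize_header( __m512i *vdata, const uint32_t *pdata )
{
   for ( int i = 0; i < 19; i++ )
      vdata[i] = splat32( pdata[i] );
}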

@@ -118,31 +118,31 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );

for ( int i = 0; i < 19; i++ )
vdata[i] = mm256_bcast_i32( pdata[i] );
vdata[i] = _mm256_set1_epi32( pdata[i] );

*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = mm256_bcast_i32( 0x480 );
vdata[16+15] = _mm256_set1_epi32( 0x480 );

block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = mm256_bcast_i32( 0x300 );
block[15] = _mm256_set1_epi32( 0x300 );

// initialize state
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
initstate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );

sha256_8way_transform_le( midstate1, vdata, initstate );

@@ -198,31 +198,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );

for ( int i = 0; i < 19; i++ )
vdata[i] = mm128_bcast_i32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );

*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = mm128_bcast_i32( 0x480 );
vdata[16+15] = _mm_set1_epi32( 0x480 );

block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = mm128_bcast_i32( 0x300 );
block[15] = _mm_set1_epi32( 0x300 );

// initialize state
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );

// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
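In these scanhash loops each SIMD lane hashes the same header with a different nonce: the nonce vector is seeded with consecutive values and advanced every iteration by a broadcast constant equal to the lane count (four, eight or sixteen above). A minimal 4-way sketch of just that stepping, using only standard SSE2 intrinsics (values and identifiers are illustrative):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t n = 1000;                                  // first nonce to try
   __m128i noncev = _mm_set_epi32( n+3, n+2, n+1, n ); // one nonce per lane
   const __m128i four = _mm_set1_epi32( 4 );           // lane count

   for ( int iter = 0; iter < 3; iter++ )
   {
      uint32_t lanes[4];
      _mm_storeu_si128( (__m128i*)lanes, noncev );
      printf( "%u %u %u %u\n", lanes[0], lanes[1], lanes[2], lanes[3] );
      noncev = _mm_add_epi32( noncev, four );          // next group of nonces
   }
   return 0;
}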

@@ -155,14 +155,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
}
else
{
A = m512_const1_64( 0x6A09E667F3BCC908 );
B = m512_const1_64( 0xBB67AE8584CAA73B );
C = m512_const1_64( 0x3C6EF372FE94F82B );
D = m512_const1_64( 0xA54FF53A5F1D36F1 );
E = m512_const1_64( 0x510E527FADE682D1 );
F = m512_const1_64( 0x9B05688C2B3E6C1F );
G = m512_const1_64( 0x1F83D9ABFB41BD6B );
H = m512_const1_64( 0x5BE0CD19137E2179 );
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
}

for ( i = 0; i < 80; i += 8 )
@@ -191,14 +191,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
}
}

@@ -239,11 +239,8 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = m512_const_64(
0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
@@ -440,10 +437,8 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
0x1011121314151617,
0x08090a0b0c0d0e0f,
0x0001020304050607 );
const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );

@@ -15,14 +15,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
}

int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
@@ -42,7 +42,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );

mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
@@ -83,14 +83,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
}

int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
@@ -110,7 +110,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );

mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(

@@ -276,6 +276,11 @@ do { \
A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
} while (0)

#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
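mm256_swap512_256 (and its 128-bit counterpart below) is the classic three-XOR swap: it exchanges two registers without naming a temporary. The same identity on plain integers, for reference (xor_swap is an illustrative name, not from this tree):

#include <stdint.h>
#include <stdio.h>

// Swap *a and *b using a ^ b ^ b == a; no temporary variable needed.
static void xor_swap( uint32_t *a, uint32_t *b )
{
   *a ^= *b;
   *b ^= *a;   // b now holds the original a
   *a ^= *b;   // a now holds the original b
}

int main(void)
{
   uint32_t x = 0x11111111, y = 0x22222222;
   xor_swap( &x, &y );
   printf( "%08x %08x\n", x, y );   // 22222222 11111111
   return 0;
}

In macro form the compiler is generally free to lower this back to plain register moves.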

#define SWAP_BC8 \
do { \
mm256_swap512_256( B0, C0 ); \
@@ -866,6 +871,11 @@ do { \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)

#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );

#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \

@@ -18,14 +18,6 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};

/*
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
mm256_shuflr128_32( b ), 0x88 )
*/

//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )

#if defined(__VAES__)

#define mm256_aesenc_2x128( x, k ) \
@@ -34,8 +26,9 @@ static const uint32_t IV512[] =
#else

#define mm256_aesenc_2x128( x, k ) \
mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) )
_mm256_inserti128_si256( _mm256_castsi128_si256( \
_mm_aesenc_si128( _mm256_castsi256_si128( x ), k ) ), \
_mm_aesenc_si128( _mm256_extracti128_si256( x, 1 ), k ), 1 )

#endif
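Without VAES, one AES round over a 256-bit vector has to be done as two 128-bit _mm_aesenc_si128 calls on the low and high halves, then reassembled, which is what the rewritten fallback above expresses with cast/extract/insert. A standalone sketch of the same construction, assuming an AES-NI plus AVX2 build (aesenc_2x128 below is an illustrative function, not the project's macro):

#include <immintrin.h>   // AVX2 and AES-NI intrinsics

// One AES encryption round on each 128-bit half of x, using the same
// 128-bit round key k for both halves.
static inline __m256i aesenc_2x128( __m256i x, __m128i k )
{
   __m128i lo = _mm_aesenc_si128( _mm256_castsi256_si128( x ), k );
   __m128i hi = _mm_aesenc_si128( _mm256_extracti128_si256( x, 1 ), k );
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}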

@@ -257,10 +250,10 @@ void shavite512_2way_init( shavite512_2way_context *ctx )
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;

h[0] = m256_const1_128( iv[0] );
h[1] = m256_const1_128( iv[1] );
h[2] = m256_const1_128( iv[2] );
h[3] = m256_const1_128( iv[3] );
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
h[2] = mm256_bcast_m128( iv[2] );
h[3] = mm256_bcast_m128( iv[3] );

ctx->ptr = 0;
ctx->count0 = 0;
@@ -320,7 +313,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
uint32_t vp = ctx->ptr>>5;

// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );

// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
@@ -334,9 +327,9 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;

casti_m256i( buf, 6 ) = m256_const1_128(
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
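These close routines pack the 128-bit message bit count into the last two vectors of the padding block: the low 16-bit word of the count goes into the top word of buf[6] via _mm_insert_epi16, and the remaining words plus the 0x0200 digest-size marker are laid out in buf[7] with _mm_set_epi16, then broadcast to every lane. A small sketch of just the packing step with an arbitrary example count, using only standard SSE2 intrinsics:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   // Example 128-bit bit count viewed as eight 16-bit words (640-bit message).
   uint16_t c[8] = { 0x0280, 0, 0, 0, 0, 0, 0, 0 };

   // Word 7 of buf6 <- low word of the count, everything else zero.
   __m128i buf6 = _mm_insert_epi16( _mm_setzero_si128(), c[0], 7 );
   // buf7 <- remaining count words, then the 0x0200 marker in the top word.
   __m128i buf7 = _mm_set_epi16( 0x0200, c[7], c[6], c[5],
                                 c[4],   c[3], c[2], c[1] );

   uint16_t w[8];
   _mm_storeu_si128( (__m128i*)w, buf6 );
   printf( "buf6 word7 = %04x\n", w[7] );   // 0280
   _mm_storeu_si128( (__m128i*)w, buf7 );
   printf( "buf7 word7 = %04x\n", w[7] );   // 0200
   return 0;
}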

@@ -400,19 +393,19 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,

if ( vp == 0 )    // empty buf, xevan.
{
casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
casti_m256i( buf, 0 ) = mm256_bcast128lo_64( 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else    // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}

casti_m256i( buf, 6 ) = m256_const1_128(
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -430,10 +423,10 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;

h[0] = m256_const1_128( iv[0] );
h[1] = m256_const1_128( iv[1] );
h[2] = m256_const1_128( iv[2] );
h[3] = m256_const1_128( iv[3] );
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
h[2] = mm256_bcast_m128( iv[2] );
h[3] = mm256_bcast_m128( iv[3] );

ctx->ptr =
ctx->count0 =
@@ -490,19 +483,19 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,

if ( vp == 0 )    // empty buf, xevan.
{
casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
casti_m256i( buf, 0 ) = mm256_bcast128lo_64( 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else    // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}

casti_m256i( buf, 6 ) = m256_const1_128(
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -227,10 +227,10 @@ void shavite512_4way_init( shavite512_4way_context *ctx )
__m512i *h = (__m512i*)ctx->h;
__m128i *iv = (__m128i*)IV512;

h[0] = m512_const1_128( iv[0] );
h[1] = m512_const1_128( iv[1] );
h[2] = m512_const1_128( iv[2] );
h[3] = m512_const1_128( iv[3] );
h[0] = mm512_bcast_m128( iv[0] );
h[1] = mm512_bcast_m128( iv[1] );
h[2] = mm512_bcast_m128( iv[2] );
h[3] = mm512_bcast_m128( iv[3] );

ctx->ptr = 0;
ctx->count0 = 0;
@@ -290,7 +290,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
uint32_t vp = ctx->ptr>>6;

// Terminating byte then zero pad
casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );

// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
@@ -304,9 +304,9 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;

casti_m512i( buf, 6 ) = m512_const1_128(
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -370,19 +370,19 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,

if ( vp == 0 )    // empty buf, xevan.
{
casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
casti_m512i( buf, 0 ) = mm512_bcast128lo_64( 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else    // half full buf, everyone else.
{
casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + vp, 6 - vp );
}

casti_m512i( buf, 6 ) = m512_const1_128(
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -401,10 +401,10 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
__m512i *h = (__m512i*)ctx->h;
__m128i *iv = (__m128i*)IV512;

h[0] = m512_const1_128( iv[0] );
h[1] = m512_const1_128( iv[1] );
h[2] = m512_const1_128( iv[2] );
h[3] = m512_const1_128( iv[3] );
h[0] = mm512_bcast_m128( iv[0] );
h[1] = mm512_bcast_m128( iv[1] );
h[2] = mm512_bcast_m128( iv[2] );
h[3] = mm512_bcast_m128( iv[3] );

ctx->ptr =
ctx->count0 =
@@ -461,19 +461,19 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,

if ( vp == 0 )    // empty buf, xevan.
{
casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
casti_m512i( buf, 0 ) = mm512_bcast128lo_64( 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else    // half full buf, everyone else.
{
casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + vp, 6 - vp );
}

casti_m512i( buf, 6 ) = m512_const1_128(
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -484,14 +484,7 @@ do { \
#undef BUTTERFLY_0
#undef BUTTERFLY_N

// twiddle is hard coded T[0] = m512_const2_64( {128,64,32,16}, {8,4,2,1} )
// Multiply by twiddle factors
// X(6) = _mm512_mullo_epi16( X(6), m512_const2_64( 0x0080004000200010,
// 0x0008000400020001 );
// X(5) = _mm512_mullo_epi16( X(5), m512_const2_64( 0xffdc0008ffef0004,
// 0x00780002003c0001 );

X(6) = _mm512_mullo_epi16( X(6), FFT64_Twiddle4w[0].v512 );
X(5) = _mm512_mullo_epi16( X(5), FFT64_Twiddle4w[1].v512 );
X(4) = _mm512_mullo_epi16( X(4), FFT64_Twiddle4w[2].v512 );

@@ -74,6 +74,10 @@
_mm256_or_si256( _mm256_and_si256( x, y ), \
_mm256_andnot_si256( x, z ) )

#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )

void sm3_8way_compress( __m256i *digest, __m256i *block )
{
__m256i W[68], W1[64];
@@ -251,6 +255,9 @@ void sm3_8way_close( void *cc, void *dst )
_mm_andnot_si128( x, z ) )

#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
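mm256_rol_var_32 and mm128_rol_var_32 build a variable rotate-left out of two shifts and an OR, since SSE2/AVX2 have no 32-bit rotate instruction. The same idiom in scalar form (rol32 is an illustrative name):

#include <stdint.h>
#include <stdio.h>

// Rotate x left by c bits (0 < c < 32): the shift/shift/or idiom the macros
// above apply to every 32-bit lane at once.
static inline uint32_t rol32( uint32_t x, unsigned c )
{
   return (x << c) | (x >> (32 - c));
}

int main(void)
{
   printf( "%08x\n", rol32( 0x80000001, 1 ) );   // 00000003
   return 0;
}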

void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];